Question 1: Flights at ABIA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
library(maps)
library(mapdata)
library(tidyr)

abia <- read.csv("https://raw.githubusercontent.com/jgscott/STA380/master/data/ABIA.csv")
abia$Turnaround <- abia$ArrDelay - abia$DepDelay
abia[is.na(abia)] <- 0
abiaorigin <- filter(abia, Origin =="AUS")
abiaorigin <- abiaorigin[(abiaorigin$Dest != "DSM")&(abiaorigin$Dest != "DTW"),]

#abiadest <- filter(abia, Dest =="AUS")
airports <- read.csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat", header = FALSE)
colnames(airports) <- c("ID", "name", "city", "country", "IATA_FAA", "ICAO", "lat", "lon", "altitude", "timezone", "DST")


airportlatlon <- subset(airports, select = c(IATA_FAA, lat, lon))

abiaorigin = merge(abiaorigin, airportlatlon, by.x = "Dest", by.y = "IATA_FAA")
#abiadest = merge(abiadest, airportlatlon, by.x = "Origin", by.y = "IATA_FAA")
origindata = abiaorigin[,c("lat","lon","Turnaround","Month")]
average = aggregate(origindata[, 3], list(origindata$lat, origindata$lon), mean)
origindataturnaround = merge(origindata, average, by.x = c("lat","lon"), by.y = c("Group.1","Group.2"))

abialatlon <- filter(airportlatlon, IATA_FAA=="AUS") #separate df for abia

usa <- map_data("usa")
origin <- gather(data = origindataturnaround, -lon, -lat, -x, -Turnaround, key = "var", value = "value")
ggplot() + geom_polygon(data = usa, aes(x=long, y = lat, group = group)) + coord_fixed(1.3) + 
geom_curve(data=origin, aes(x = lon, y = lat, xend = abialatlon$lon, yend = abialatlon$lat, col = x), size = .01, curvature = .2) + 
 geom_point(data=origin, aes(x = lon, y = lat), col = "red", shape = ".") + scale_colour_gradient2()# + facet_wrap(~value, scales = "free")

Q: Is the late departure caused from the late arrival? Or did the airport contribute to make the delay time worse? What this is showing us is the number of minutes the airport contributes to a delay. If the average number is very negative, this means the airport is contributing a lot of time to the delay. Redder curves have a slower turnaround time at AUS, purple-er times have a faster turnaround time. Ie, the one flight to Des Moines had a super fast turnaround time, while the flights to Detroit on average have a slower turnaround time.

Q2:

library(tm) 
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:ggmap':
## 
##     inset
readerPlain = function(fname){
                readPlain(elem=list(content=readLines(fname)), 
                            id=fname, language='en') }

pathname= "/Users/daniellediehl/Documents/MSBA/Predictive_Modeling/STA380/data/ReutersC50"
#pathname = "C:/users/jessc/documents/STA-380-Part-2--Exercises-2/data/ReutersC50/"
train = paste(pathname,"C50train/", sep="")                 
authors = list.dirs(train, full.names = FALSE)[-1]

for(author in authors){
  nam <- paste("articles_", author, sep = "")
  file = paste(train,author,"/*.txt",sep="")
  assign(nam, Sys.glob(file))
  assign(author, lapply(eval(parse(text = nam)),readerPlain))
  nam2 <- paste("names_", author, sep="")
  assign(nam2, eval(parse(text = nam)) %>%
    { strsplit(., '/', fixed=TRUE) } %>%
    { lapply(., tail, n=2) } %>%
    { lapply(., paste0, collapse = '') } %>%
    unlist)
  assign(author,setNames(as.list(eval(parse(text = author))), eval(parse(text = nam2))))
  docs <- paste(author,"_documents", sep="")
  assign(docs, Corpus(VectorSource(eval(parse(text = author)))))
  assign(docs, tm_map(eval(parse(text = docs)), content_transformer(tolower)))
  assign(docs, tm_map(eval(parse(text = docs)), content_transformer(removeNumbers)))
  assign(docs, tm_map(eval(parse(text = docs)), content_transformer(removePunctuation)))
  assign(docs, tm_map(eval(parse(text = docs)), content_transformer(stripWhitespace)))
  assign(docs, tm_map(eval(parse(text = docs)), content_transformer(removeWords), stopwords("en")))
  dtm <- paste("dtm_",author,sep="")
  assign(dtm, DocumentTermMatrix(eval(parse(text=docs))))
  #assign(dtm, removeSparseTerms(eval(parse(text=docs)),0.95))
    }

Q3. Practice with association rule mining

Question: Pick your own thresholds for lift and confidence; just be clear what these thresholds are and how you picked them. Do your discovered item sets make sense? Present your discoveries in an interesting and concise way.

Answer:

grocery <- read.transactions('https://raw.githubusercontent.com/jgscott/STA380/master/data/groceries.txt', sep=',')
summary(grocery)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55 
##   16   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   46   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
inspect(grocery[1:5])
##     items                     
## [1] {citrus fruit,            
##      margarine,               
##      ready soups,             
##      semi-finished bread}     
## [2] {coffee,                  
##      tropical fruit,          
##      yogurt}                  
## [3] {whole milk}              
## [4] {cream cheese,            
##      meat spreads,            
##      pip fruit,               
##      yogurt}                  
## [5] {condensed milk,          
##      long life bakery product,
##      other vegetables,        
##      whole milk}
itemFrequencyPlot(grocery, topN = 20) 

rules <- apriori(grocery, parameter=list(support=0.01, confidence=0.5, maxlen=6))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##       6  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspect(rules[1:5])
##     lhs                     rhs             support confidence     lift
## [1] {curd,                                                             
##      yogurt}             => {whole milk} 0.01006609  0.5823529 2.279125
## [2] {butter,                                                           
##      other vegetables}   => {whole milk} 0.01148958  0.5736041 2.244885
## [3] {domestic eggs,                                                    
##      other vegetables}   => {whole milk} 0.01230300  0.5525114 2.162336
## [4] {whipped/sour cream,                                               
##      yogurt}             => {whole milk} 0.01087951  0.5245098 2.052747
## [5] {other vegetables,                                                 
##      whipped/sour cream} => {whole milk} 0.01464159  0.5070423 1.984385
inspect(subset(rules, subset=confidence > 0.8))
summary(rules)
## set of 15 rules
## 
## rule length distribution (lhs + rhs):sizes
##  3 
## 15 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3 
## 
## summary of quality measures:
##     support          confidence          lift      
##  Min.   :0.01007   Min.   :0.5000   Min.   :1.984  
##  1st Qu.:0.01174   1st Qu.:0.5151   1st Qu.:2.036  
##  Median :0.01230   Median :0.5245   Median :2.203  
##  Mean   :0.01316   Mean   :0.5411   Mean   :2.299  
##  3rd Qu.:0.01403   3rd Qu.:0.5718   3rd Qu.:2.432  
##  Max.   :0.02227   Max.   :0.5862   Max.   :3.030  
## 
## mining info:
##     data ntransactions support confidence
##  grocery          9835    0.01        0.5

After reading in the grocery dataset, we first look at the item frequency plot to look at frequent items. We randomly chosen support of 0.01 and confidence of 0.5 values to start off, but we will try different values pairs to see which balance of values will result in the number of rules that will be most beneficial for our analysis.

Support = Number of transactions with both A and B / Total number of transactions=P(A∩B)

Confidence = Number of transactions with both A and B / Total number of transactions with A=P(A∩B) / P(A)

Expected Confidence = Number of transactions with B / Total number of transactions=P(B)

Lift=Confidence / Expected Confidence = P(A∩B) / P(A)*P(B)

Interactive inspect with datatable

So here we can play around with different set of values for support and confidence to see in datatable form to see different pairs for market basket analysis. When we set rules3 with support of 0.01 and confidence of 0.5, we can see 15 entries that show LHS of curd, yogurt, butter, eggs, whipped/sour cream, before buying whole milk (RHS), which makes sense as they all need to be refrigerated and therefore probably located closer together. Rules4 with support of 0.001 and confidence of 0.8 reveal much bigger datatable with 410 entries that reveal liquor, red/blush wine purchased with bottled beer which are all alcohol and make sense to buy them together and also reveals cereal as part of the pairs before buying whole milk.

As we perform more visualizations to find patterns within these baskets as well as setting different rules, we will re-adjust our confidence and support values to maximize the effectiveness of our analysis.

rules3 <- apriori(grocery, parameter=list(support=0.01, confidence=0.5))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
rules4 <- apriori(grocery, parameter=list(support=0.001, confidence=0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [410 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspectDT(rules3) #if we want highest support 15 entries 
inspectDT(rules4) #if we want wider association with 410 entries

#### Visualizations

rules_sorted <- sort(rules, by='confidence', decreasing=TRUE)
#matrix representation
#plot(rules[1:20], method = 'matrix', control = list(reorder=TRUE))

#Interactive Scatterplot 
plotly_arules(rules)
#plot(rules, method = 'graph', interactive=TRUE, shading=NA)
subrules <- head(sort(rules, by='lift'),10) #Graph just 10 rules by 10 highest lifts 
plot(subrules, method='graph')

plot(rules, method='grouped') #Grouped Matrix shows LHS and RHS 

plot(subrules,method='paracoord', control=list(reorder=TRUE)) #Parallel Coordinates plot for 10 rules 

People tend to buy semi finished breads and margarine before ready soups so we should put those items close to the ready soups isle. Also, people tend to buy citrus fruit, soda, canned/bottled beer, and shopping bags before they buy margarine and ready soups. When they buy whole milk together with ready soups, they tend to buy other vegetables. Also, people tend to buy baking powder, sugar, flour, and eggs before buying margarine, which sounds like they are buying items for baking.

Getting the product recommendation rules

rules_conf <- sort(rules, by='confidence', decreasing=TRUE)
inspect(head(rules_conf)) #High-confidence rules
##     lhs                   rhs                   support confidence     lift
## [1] {citrus fruit,                                                         
##      root vegetables}  => {other vegetables} 0.01037112  0.5862069 3.029608
## [2] {root vegetables,                                                      
##      tropical fruit}   => {other vegetables} 0.01230300  0.5845411 3.020999
## [3] {curd,                                                                 
##      yogurt}           => {whole milk}       0.01006609  0.5823529 2.279125
## [4] {butter,                                                               
##      other vegetables} => {whole milk}       0.01148958  0.5736041 2.244885
## [5] {root vegetables,                                                      
##      tropical fruit}   => {whole milk}       0.01199797  0.5700483 2.230969
## [6] {root vegetables,                                                      
##      yogurt}           => {whole milk}       0.01453991  0.5629921 2.203354
rules_lift <- sort(rules, by='lift', decreasing=TRUE)
inspect(head(rules_lift)) #High lift rules 
##     lhs                   rhs                   support confidence     lift
## [1] {citrus fruit,                                                         
##      root vegetables}  => {other vegetables} 0.01037112  0.5862069 3.029608
## [2] {root vegetables,                                                      
##      tropical fruit}   => {other vegetables} 0.01230300  0.5845411 3.020999
## [3] {rolls/buns,                                                           
##      root vegetables}  => {other vegetables} 0.01220132  0.5020921 2.594890
## [4] {root vegetables,                                                      
##      yogurt}           => {other vegetables} 0.01291307  0.5000000 2.584078
## [5] {curd,                                                                 
##      yogurt}           => {whole milk}       0.01006609  0.5823529 2.279125
## [6] {butter,                                                               
##      other vegetables} => {whole milk}       0.01148958  0.5736041 2.244885

The rules with confidence of 1 imply that, whenever the LHS item was purchased, the RHS item was also purchased 100% of the time. So in our grocery rules, if one buys citrus fruit and root vegetables, there’s 58.6% chance they will buy other vegetables.

A rule with a lift of 3 implies that, the items in LHS and RHS are 3 times more likely to be purchased together compared to the purchases when they are assumed to be unrelated, which is for the same LHS-RHS pair of {citrus fruit, root vegetables} -> {other vegetables}.

Targeting Items

  • what are customers likely to buy before or after this item What are people buying before they buy margarine?

Tend to buy bottled water, eggs, and tropic fruit. Flour and tropical fruit as lhs scored higher on support and slightly less confidence, so we consider this as well when placing items on isles or for target coupon marketing.

rules <- apriori(data=grocery, parameter=list(supp=0.001, conf=0.08), appearance = list(default = 'lhs', rhs = 'margarine'), control=list(verbose=F))
rules <- sort(rules, decreasing=TRUE, by='confidence')
inspect(rules[1:5])
##     lhs                   rhs             support confidence     lift
## [1] {bottled water,                                                  
##      domestic eggs,                                                  
##      tropical fruit}   => {margarine} 0.001016777  0.4545455 7.761206
## [2] {flour,                                                          
##      tropical fruit}   => {margarine} 0.001423488  0.4375000 7.470161
## [3] {flour,                                                          
##      whole milk,                                                     
##      yogurt}           => {margarine} 0.001016777  0.4000000 6.829861
## [4] {bottled water,                                                  
##      flour}            => {margarine} 0.001016777  0.3703704 6.323945
## [5] {flour,                                                          
##      other vegetables,                                               
##      yogurt}           => {margarine} 0.001016777  0.3703704 6.323945

What are people buying after they buy margarine?

rules2 <- apriori(data=grocery, parameter=list(supp=0.01, conf=0.1), appearance = list(default = 'rhs', lhs = 'margarine'), control=list(verbose=F))
rules2 <- sort(rules2, by='confidence', decreasing=TRUE)
inspect(rules2)
##      lhs            rhs                support    confidence lift     
## [1]  {margarine} => {whole milk}       0.02419929 0.4131944  1.6170980
## [2]  {margarine} => {other vegetables} 0.01972547 0.3368056  1.7406635
## [3]  {}          => {whole milk}       0.25551601 0.2555160  1.0000000
## [4]  {margarine} => {rolls/buns}       0.01474326 0.2517361  1.3686151
## [5]  {margarine} => {yogurt}           0.01423488 0.2430556  1.7423115
## [6]  {}          => {other vegetables} 0.19349263 0.1934926  1.0000000
## [7]  {margarine} => {root vegetables}  0.01108287 0.1892361  1.7361354
## [8]  {}          => {rolls/buns}       0.18393493 0.1839349  1.0000000
## [9]  {margarine} => {bottled water}    0.01026945 0.1753472  1.5865133
## [10] {}          => {soda}             0.17437722 0.1743772  1.0000000
## [11] {margarine} => {soda}             0.01016777 0.1736111  0.9956066
## [12] {}          => {yogurt}           0.13950178 0.1395018  1.0000000
## [13] {}          => {bottled water}    0.11052364 0.1105236  1.0000000
## [14] {}          => {root vegetables}  0.10899847 0.1089985  1.0000000
## [15] {}          => {tropical fruit}   0.10493137 0.1049314  1.0000000

They tend to buy whole milk, other vegetables, rolls/buns, and yogurt after buying margarine. Whole milk and yogurt should be placed in the dairy section near margarine, so this chain association does make sense.